<!DOCTYPE html>
<html lang="zh">
    <head>
        <meta charset="utf-8">
        <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
        <title>机器学习 | ChenDong Zhu&#39;s Blog</title><meta name="viewport" content="width=device-width, initial-scale=1.0">
<meta name="robots" content="noodp" />
<meta name="Description" content="学习机器学习的笔记"><link rel="prev" href="/2021/09/%E8%BD%AF%E4%BB%B6%E5%AE%89%E5%85%A8/" /><link rel="canonical" href="/2021/10/%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0/" />
<link rel="shortcut icon" type="image/x-icon" href="/favicon.ico" />
<link rel="apple-touch-icon" sizes="180x180" href="/apple-touch-icon.png">
<link rel="icon" type="image/png" sizes="32x32" href="/favicon-32x32.png">
<link rel="icon" type="image/png" sizes="16x16" href="/favicon-16x16.png">
<link rel="manifest" href="/site.webmanifest">
<link rel="mask-icon" href="/safari-pinned-tab.svg" color="#5bbad5">
<meta name="msapplication-TileColor" content="#da532c">
<meta name="theme-color" content="#ffffff"><meta property="og:title" content="机器学习" />
<meta property="og:description" content="学习机器学习的笔记" />
<meta property="og:type" content="article" />
<meta property="og:url" content="/2021/10/%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0/" /><meta property="article:section" content="posts" />
<meta property="article:published_time" content="2021-10-01T09:00:00+08:00" />
<meta property="article:modified_time" content="2021-10-01T09:00:00+08:00" />

<meta name="twitter:card" content="summary"/>
<meta name="twitter:title" content="机器学习"/>
<meta name="twitter:description" content="学习机器学习的笔记"/>
<script type="application/ld+json">
    {
        "@context": "http://schema.org",
        "@type": "BlogPosting",
        "headline": "机器学习",
        "mainEntityOfPage": {
            "@type": "WebPage",
            "@id": "\/2021\/10\/%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0\/"
        },"image": {
                "@type": "ImageObject",
                "url": "\/cover.png",
                "width":  800 ,
                "height":  600 
            },"genre": "posts","keywords": "机器学习","wordcount":  1829 ,
        "url": "\/2021\/10\/%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0\/","datePublished": "2021-10-01T09:00:00\u002b08:00","dateModified": "2021-10-01T09:00:00\u002b08:00","license": "This work is licensed under a Creative Commons Attribution-NonCommercial 4.0 International License.","publisher": {
                "@type": "Organization",
                "name": "xxxx",
                "logo": {
                "@type": "ImageObject",
                "url": "\/logo.png",
                "width":  127 ,
                "height":  40 
                }
            },"description": "学习机器学习的笔记"
    }
    </script><link rel="stylesheet" href="/css/style.min.css"><link rel="stylesheet" href="/css/lib/fontawesome-free/all.min.css"><link rel="stylesheet" href="/css/lib/animate/animate.min.css"></head>
    <body><script>
            // Read the saved theme from localStorage (when the API is present) and
            // record on window.isDark whether it equals 'dark'.
            window.isDark = Boolean(window.localStorage) && window.localStorage.getItem('theme') === 'dark';
            // Tag the body element so the dark-theme styles apply.
            if (window.isDark) {
                document.body.classList.add('dark-theme');
            }
        </script><div class="wrapper"><nav class="navbar">
    <div class="navbar-container">
        <div class="navbar-header animated bounceIn">
            <a href="/">ChenDong Zhu&#39;s Blog</a>
        </div>
        <div class="navbar-menu"><a class="menu-item" href="/posts" title="">文章</a><a class="menu-item" href="/tags" title="">标签</a><a class="menu-item" href="/categories" title="">分类</a><a class="menu-item" href="/about" title="">关于</a><a class="menu-item" href="https://hugo-loveit-en.netlify.com" title="English"><i class="fas fa-language fa-fw"></i></a><a href="javascript:void(0);" class="theme-switch"><i class="fas fa-adjust fa-rotate-180 fa-fw" title="切换主题"></i></a>
        </div>
    </div>
</nav><nav class="navbar-mobile">
    <div class="navbar-container">
        <div class="navbar-header">
            <div class="navbar-header-title animated bounceIn">
                <a href="/">ChenDong Zhu&#39;s Blog</a>
            </div>
            <div class="menu-toggle" id="menu-toggle">
                <span></span><span></span><span></span>
            </div>
        </div>
        <div class="navbar-menu" id="mobile-menu"><a class="menu-item" href="/posts" title="">文章</a><a class="menu-item" href="/tags" title="">标签</a><a class="menu-item" href="/categories" title="">分类</a><a class="menu-item" href="/about" title="">关于</a><a class="menu-item" href="https://hugo-loveit-en.netlify.com" title="English"></a><a href="javascript:void(0);" class="theme-switch"><i class="fas fa-adjust fa-rotate-180 fa-fw" title="切换主题"></i></a>
        </div>
    </div>
</nav><main class="main">
                <div class="container"><article class="page"><h1 class="post-title animated flipInX">机器学习</h1><div class="post-meta">
            <div class="post-meta-main"><a class="author" href="/" rel="author" target="_blank">
                    <i class="fas fa-user-circle fa-fw"></i>ChenDong Zhu
                </a>&nbsp;<span class="post-category">收录于&nbsp;<i class="far fa-folder fa-fw"></i><a href="/categories/%E5%AD%A6%E4%B9%A0/">学习</a>&nbsp;</span></div>
            <div class="post-meta-other"><i class="far fa-calendar-alt fa-fw"></i><time datetime=2021-10-01>2021-10-01</time>&nbsp;
                <i class="fas fa-pencil-alt fa-fw"></i>约 1829 字&nbsp;
                <i class="far fa-clock fa-fw"></i>预计阅读 4 分钟&nbsp;</div>
        </div><div class="post-toc" id="post-toc">
                <h2 class="post-toc-title">目录</h2>
                <div class="post-toc-content"><nav id="TableOfContents">
  <ul>
    <li><a href="#ch-2-模型评估与选择">Ch 2 模型评估与选择</a>
      <ul>
        <li><a href="#ch-22-评估方法">Ch 2.2 评估方法</a></li>
        <li><a href="#ch-23-性能评估">Ch 2.3 性能评估</a></li>
      </ul>
    </li>
  </ul>
</nav></div>
            </div>
            <div class="post-toc-mobile" id="post-toc-mobile">
                <details>
                    <summary>
                        <div class="post-toc-title">
                            <span>目录</span>
                            <span><i class="details icon fas fa-angle-down"></i></span>
                        </div>
                    </summary>
                    <div class="post-toc-content"><nav id="TableOfContentsMobile">
  <ul>
    <li><a href="#ch-2-模型评估与选择">Ch 2 模型评估与选择</a>
      <ul>
        <li><a href="#ch-22-评估方法">Ch 2.2 评估方法</a></li>
        <li><a href="#ch-23-性能评估">Ch 2.3 性能评估</a></li>
      </ul>
    </li>
  </ul>
</nav></div>
                </details>
            </div><div class="post-content"><a class="post-dummy-target" id="ch-2-模型评估与选择"></a><h2>Ch 2 模型评估与选择</h2>
<p>对于模型来说，需要有个度量模型好坏的方法，正如之前第二届龙芯杯南京大学队弄出来的差分验证框架，这种度量、验证是很重要的。在这一章中，ch2.1类似导言，ch2.2是介绍了How，即怎样进行模型评估，ch2.3介绍了What，使用什么指标，ch2.4介绍了统计学意义上对指标如何比较（不能使用单纯比大小的方法，需要从统计学意义上考虑）</p>
<a class="post-dummy-target" id="ch-22-评估方法"></a><h3>Ch 2.2 评估方法</h3>
<p><strong>留出法</strong></p>
<p>就是数据集划分成两个互斥的集合，一个为训练集，一个为测试集（划分需要尽量保证数据分布的一致性）</p>
<p><strong>交叉验证（k折交叉验证）</strong></p>
<p>就是把数据划分$k$份，其中1份作为测试集，剩下的用来训练，由于把数据集分为$k$份存在很多分法，所以会取多轮$k$折交叉验证的均值，该方法在$k$等于样本数时会衰退为特例（留一法），留一法在数据量大的情况下，需要训练的太多。</p>
<p><strong>自助法</strong></p>
<p>留出法和交叉验证共有的问题在于，它们都减小了训练数据的规模，会导致一定的估计偏差。自助法每次从数据集$D$中随机复制一个样本到$D^*$中，重复$M$次（$M$为样本数），</p>
<p>这就保证了两个数据集规模上相等，但是这种方法改变了数据的分布，一般在数据量小的时候使用。</p>
<a class="post-dummy-target" id="ch-23-性能评估"></a><h3>Ch 2.3 性能评估</h3>
<p>该章后半部分参考<a href="https://www.zhihu.com/question/63492375/answer/1420670463" target="_blank">知乎答案</a>，这个知乎答案对于<strong>代价曲线</strong>有着比较详细的论述。</p>
<p><strong>均方误差</strong></p>
<p>回归任务常用的性能度量
$$
E(f;D) = \frac{1}{m}\sum_{i=1}^{m}(f(x_i)-y_i)^2
$$
更一般性的，对于数据分布$\mathcal{D}$和概率密度函数$p(*)$，均方误差可描述为
$$
E(f;\mathcal{D}) = \int_{x\sim\mathcal{D}}(f(x)-y)^2p(x)dx
$$
<strong>错误率与精度</strong></p>
<p>错误率
$$
E(f;D)=\frac{1}{m}\sum_{i=1}^{m}\mathbb{I}(f(x_i)\neq y_i)
$$
精度
$$
\begin{aligned}
acc(f;D) &amp;= \frac{1}{m}\sum_{i=1}^{m}\mathbb{I}(f(x_i)=y_i)\\
&amp;=1-E(f;D)
\end{aligned}
$$
更一般性的对于数据分布$\cal{D}$和概率密度函数$p(*)$，这里就不推导了</p>
<p><strong>查准率、查全率和$F1$</strong></p>
<p>分类结果有4类：TP(true positive)、TN(true negative)、FP(false positive)、FN(false negative)</p>
<p>查准率$P$和查全率$R$分别定义为：
$$
\begin{aligned}
P &amp;= \frac{TP}{TP+FP}\\
R &amp;= \frac{TP}{TP+FN}
\end{aligned}
$$
<strong>P-R图</strong></p>
<p><figure><img src="/svg/loading.min.svg" data-sizes="auto" data-src="/images/ML/figure1.png" alt="P-R图" class="lazyload"><figcaption class="image-caption">P-R图</figcaption></figure></p>
<p><font color="red">WARNING:</font>西瓜书里的P-R图为了好看画成这样的，实际上的P-R图是非单调的，不平滑的</p>
<p>P-R图的绘制：根据学习器的预测结果对样例进行排序，排在前面的是学习器认为“最有可能”是正例的样本，按这个顺序对样本作为正例进行预测，每次可以计算出当前的查全率和查准率，以查准率为纵轴，以查全率为横轴作图，就得到了P-R曲线</p>
<p><strong>平衡点(Break-Even Point)</strong></p>
<p>对于学习器A和B人们希望用P-R评个好坏，但是往往不会形成学习器A完虐学习器B的情况，所以用BEP进行度量，BEP是查准率等于查全率时候的取值（<font color="red">会不会存在多个点查准率等于查全率呢?</font>）</p>
<p><strong>$F1$度量和$F_{\beta}$度量</strong></p>
<p>$F1$度量是查准率和查全率的调和平均
$$
\frac{1}{F1}=\frac{1}{2}(\frac{1}{P}+\frac{1}{R})
$$
但是，通常情况查准率和查全率不是同等重要的，比如找逃犯的时候查准率低一点没事，但是查全率要高，这种情况下使用$F_{\beta}$
$$
\frac{1}{F_{\beta}} = \frac{1}{1+{\beta}^2}(\frac{1}{P}+\frac{\beta^2}{R})
$$
且$\beta$是查全率对查准率的重要程度。</p>
<p><strong>ROC和AUC</strong></p>
<p>这个就需要结合上面的知乎回答理解了。</p>
<p>学习器分类是有阈值的，即大于阈值的判定为正类，小于阈值的判定为负类，那么怎么判定这个阈值呢？且<font color="red">阈值的判定会影响查准率和查全率这些指标</font>，在模型的判断中我们怎么去除阈值的影响呢？</p>
<p>ROC的横轴是“假正例率”，纵轴是“真正例率”，具体定义见书P33，图片如下：</p>
<p><figure><img src="/svg/loading.min.svg" data-sizes="auto" data-src="/images/ML/figure2.png" alt="ROC例子" class="lazyload"><figcaption class="image-caption">ROC例子</figcaption></figure></p>
<p>AUC是ROC曲线下各部分的面积求和，AUC考虑的是样本预测的排序质量，具体解释看P36</p>
<p><strong>代价敏感错误率与代价曲线</strong></p>
<p>不同的错误，可能代价不一样，上面提到的都是均等代价的，可能把0类误判成1类的后果比把1类误判成0类更大，在非均等代价下使用CC(代价曲线)</p>
<p>首先我们不知道真实的正例和反例的分布情况，我们获得的数据和真实情况是有偏差的！所以我们设正例概率为$p$，对于无/等代价的CC空间，横轴为正例概率$p$，纵轴为期望错误率$p*FNR+(1-p)*FPR$。</p>
<p>ROC上面每个点，到这里变成了一根线。（更加直观了）</p>
<p><figure><img src="/svg/loading.min.svg" data-sizes="auto" data-src="/images/ML/figure3.jpg" alt="分布与阈值" class="lazyload"><figcaption class="image-caption">分布与阈值</figcaption></figure></p>
<p>可以看到，对于0类的分布和1类的分布，总是会存在交叉的，这一块的分数，可能是0类，也可能是1类，阈值就是中间这个黑线，这是最好的，但是对于非均等代价，如图</p>
<p><figure><img src="/svg/loading.min.svg" data-sizes="auto" data-src="/images/ML/figure3.jpg" alt="非均等代价下的情况" class="lazyload"><figcaption class="image-caption">非均等代价下的情况</figcaption></figure></p>
<p>这种情况下，两条曲线都因为$cost_{01}$和$cost_{10}$有了放缩，这会影响到中间的灰色区域及其阈值，对模型的评价不利，这种情况下我们要进行归一化处理，消解这个放缩导致的影响（这就是西瓜书归一化的由来）</p>
<p>按我理解，CC空间所有直线下面的灰色区间，其实就是两个分布重合的灰色区间，一个好的模型能尽量分开两个分布，使中间的灰色区域更小，也就是让西瓜书里面p37那个图灰色区域更小，即学习器更好，更能分开两个类别。</p>
</div><div class="post-footer" id="post-footer">
    <div class="post-info">
        <div class="post-info-line">
            <div class="post-info-mod">
                <span>本文于 2021-10-01 更新</span>
            </div>
            <div class="post-info-license"></div>
        </div>
        <div class="post-info-line">
            <div class="post-info-md"></div>
            <div class="post-info-share"><span><a href="//twitter.com/share?url=%2f2021%2f10%2f%25E6%259C%25BA%25E5%2599%25A8%25E5%25AD%25A6%25E4%25B9%25A0%2f&amp;text=%e6%9c%ba%e5%99%a8%e5%ad%a6%e4%b9%a0&amp;via=" target="_blank" title="分享到 Twitter">
            <i class="fab fa-twitter fa-fw"></i>
        </a><a href="//www.facebook.com/sharer/sharer.php?u=%2f2021%2f10%2f%25E6%259C%25BA%25E5%2599%25A8%25E5%25AD%25A6%25E4%25B9%25A0%2f" target="_blank" title="分享到 Facebook">
            <i class="fab fa-facebook-square fa-fw"></i>
        </a><a href="//reddit.com/submit?url=%2f2021%2f10%2f%25E6%259C%25BA%25E5%2599%25A8%25E5%25AD%25A6%25E4%25B9%25A0%2f&amp;title=%e6%9c%ba%e5%99%a8%e5%ad%a6%e4%b9%a0" target="_blank" title="分享到 Reddit">
            <i class="fab fa-reddit fa-fw"></i>
        </a><a href="//service.weibo.com/share/share.php?url=%2f2021%2f10%2f%25E6%259C%25BA%25E5%2599%25A8%25E5%25AD%25A6%25E4%25B9%25A0%2f&amp;appkey=&amp;title=%e6%9c%ba%e5%99%a8%e5%ad%a6%e4%b9%a0" target="_blank" title="分享到 Weibo">
            <i class="fab fa-weibo fa-fw"></i>
        </a></span></div>
        </div>
    </div>

    <div class="post-info-more">
        <section><span class="tag">
                        <a href="/tags/%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0/"><i class="fas fa-tag fa-fw"></i>&nbsp;机器学习</a>&nbsp;
                    </span></section>
        <section>
            <span><a href="javascript:window.history.back();">返回</a></span>&nbsp;|&nbsp;<span><a href="/">主页</a></span>
        </section>
    </div>

    <div class="post-nav"><a href="/2021/09/%E8%BD%AF%E4%BB%B6%E5%AE%89%E5%85%A8/" class="prev" rel="prev" title="软件安全安全相关文献阅读笔记"><i class="fas fa-angle-left fa-fw"></i>软件安全安全相关文献阅读笔记</a></div>
</div><div class="post-comment"></div>
    </article></div>
            </main><footer class="footer">
    <div class="copyright"><div class="copyright-line">由 <a href="https://gohugo.io/" target="_blank" rel="external nofollow noopener noreferrer">Hugo</a> 强力驱动 | 主题 - <a href="https://github.com/dillonzq/LoveIt" target="_blank" rel="external nofollow noopener noreferrer">LoveIt<i class="far fa-heart fa-fw"></i></a>
        </div>

        <div class="copyright-line"><i class="far fa-copyright fa-fw"></i><span itemprop="copyrightYear">2020 - 2021</span><span class="author" itemprop="copyrightHolder">&nbsp;<a href="/" target="_blank">ChenDong Zhu</a></span>&nbsp;|&nbsp;<span class="license"><a rel="license external nofollow noopener noreferrer" href="https://creativecommons.org/licenses/by-nc/4.0/" target="_blank">CC BY-NC 4.0</a></span><span class="icp-splitter">&nbsp;|&nbsp;</span><br class="icp-br"/>
                <span class="icp"><a href="http://beian.miit.gov.cn">苏ICP备2021012540号</a></span></div>
    </div>
</footer></div><a href="#" class="dynamic-to-top" id="dynamic-to-top" data-scroll>
            <span>&nbsp;</span>
        </a><script src="/js/lib/jquery/jquery.slim.min.js"></script><script src="/js/lib/lazysizes/lazysizes.min.js"></script><script src="/js/lib/smooth-scroll/smooth-scroll.polyfills.min.js"></script><script>window.scroll = new SmoothScroll('[data-scroll]', {speed: 300, speedAsDuration: true});</script><link rel="stylesheet" href="/css/lib/katex/katex.min.css"><script src="/js/lib/katex/katex.min.js"></script><script defer src="/js/lib/katex/auto-render.min.js"></script><link rel="stylesheet" href="/css/lib/katex/copy-tex.min.css"><script defer src="/js/lib/katex/copy-tex.min.js"></script><script defer src="/js/lib/katex/mhchem.min.js"></script><script>
        // Once the DOM has been parsed, run KaTeX auto-render over the whole page.
        document.addEventListener("DOMContentLoaded", function () {
            // Delimiter pairs to scan for; "$$" is listed ahead of "$" and the
            // `display` flag selects display (true) or inline (false) rendering.
            var mathDelimiters = [
                { left: "$$", right: "$$", display: true },
                { left: "\\(", right: "\\)", display: false },
                { left: "\\[", right: "\\]", display: true },
                { left: "$", right: "$", display: false }
            ];
            renderMathInElement(document.body, { delimiters: mathDelimiters });
        });
    </script>
    <script src="https://eqcn.ajz.miesnfu.com/wp-content/plugins/wp-3d-pony/live2dw/lib/L2Dwidget.min.js"></script>

    <script>
        // Live2D mascot bootstrap.
        // The widget is started only when the user-agent string does not match the
        // mobile pattern below AND the L2Dwidget library actually loaded; a failed
        // third-party script therefore no longer throws a ReferenceError here.
        var sUserAgent = navigator.userAgent.toLowerCase();

        // Re-anchors the widget canvas (element id "live2dcanvas") with a fixed
        // position at the left edge, 100px below the bottom of the viewport.
        // Does nothing when the canvas element is not in the document.
        function changebottom() {
            var canvas_temp = document.getElementById("live2dcanvas");
            if (!canvas_temp) {
                return;
            }
            canvas_temp.style.cssText = "position: fixed; opacity: 1; left: 0px; bottom: -100px; z-index: 99999; pointer-events: none;";
        }

        if (!/ipad|iphone|midp|rv:1.2.3.4|ucweb|android|windows ce|windows mobile/.test(sUserAgent) &&
            typeof L2Dwidget !== "undefined") {
            L2Dwidget.init({
                "model": {
                    "jsonPath": "/data/Shinomiya_Kaguya/model.model.json",
                    "scale": 1
                },
                "display": {
                    "position": "left",
                    "width": 150,
                    "height": 300,
                    "hOffset": 0,
                    "vOffset": 0
                },
                "mobile": {
                    "show": true,
                    "scale": 0.5
                },
                "react": {
                    "opacityDefault": 1,
                    "opacityOnHover": 0.2
                }
            });

            // Register with addEventListener rather than assigning window.onload,
            // so this handler neither replaces nor is replaced by another script's
            // onload handler.
            window.addEventListener("load", changebottom);
        }


    </script><script src="/js/blog.min.js"></script>
</body>
</html>